FILENAME: Analysis Notebook.ipynb
PROJECT: Multivariate Financial Forecasting
DATE CREATED: 24-APR-20
DATE UPDATED: 24-APR-20
TASK: Develop and implement a recurrent neural network
PURPOSE: Given a multivariate dataset, forecast and predict the corresponding response value for each record
INTENT: The purpose of this project is to conduct exploratory analysis of the provided data set and apply both supervised and unsupervised algorithms in order to extract meaningful information in support of future open source analysis. The project is broken down into two separate projects, with each project having four (4) distinct phases:
PROJECT: Randomized Budget Data
Environment Setup
Data ETL
Data Exploration
Model Development
Create random arrays to store the test values:
YEAR +3: yr3_forecast
YEAR +2: yr2_forecast
YEAR +1: yr1_forecast
YEAR +0: plan
YEAR -1: approp
YEAR -2: obligate
# Notebook setup: render the RNN reference diagram and stretch the Jupyter
# cell container to the full browser width.
from IPython.display import Image
from IPython.core.display import display, HTML

Image(filename="data/rnn.png", width=750, height=750)
display(HTML("<style>.container { width:100% !important; }</style>"))
Import the necessary libraries needed for ETL, engineering, and export efforts
import pandas as pd
import csv
import random
import sqlite3
import itertools
import numpy as np
import datetime
import time as t
import getpass as gp
Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import descartes
from shapely.geometry import Point, Polygon
Import the required ML & neural net libraries
from scipy import stats
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
Declare a function used to return a randomized pandas dataframe
def init_array(df_length):
    '''
    Build and return a DataFrame of `df_length` rows of randomized budget data.

    Each record simulates a six-stage budget lifecycle: a three-years-out
    forecast is drawn uniformly at random, and every subsequent column is the
    previous column scaled by an independent uniform random factor, so the
    columns are strongly (but noisily) correlated.

    Parameters
    ----------
    df_length : int
        Number of rows (records) to generate.

    Returns
    -------
    pandas.DataFrame
        Columns: 'yr+3_forecast', 'yr+2_forecast', 'yr+1_forecast',
        'yr0_plan', 'yr-1_approp', 'yr-2_oblig'.
    '''
    # Base forecast, three years out: integer dollars in [100000, 30000000).
    yr3_forecast = np.random.randint(low=100000, high=30000000, size=df_length)
    # Each later stage is the previous stage times a uniform random factor,
    # rounded to cents. Draw order matches the original implementation so the
    # random stream is unchanged for a given seed.
    yr2_forecast = np.round(yr3_forecast * np.random.uniform(low=0.5, high=1.3, size=df_length), 2)
    yr1_forecast = np.round(yr2_forecast * np.random.uniform(low=0.8, high=1.2, size=df_length), 2)
    plan_val = np.round(yr1_forecast * np.random.uniform(low=0.6, high=1.3, size=df_length), 2)
    approp_val = np.round(plan_val * np.random.uniform(low=0.6, high=1.2, size=df_length), 2)
    # Obligation factor tops out at 1.0: obligations never exceed appropriations.
    oblig_val = np.round(approp_val * np.random.uniform(low=0.8, high=1.0, size=df_length), 2)
    # Build the frame in one shot (the original declared empty columns and
    # assigned them one by one, and carried an unused `length` local).
    return pd.DataFrame({
        'yr+3_forecast': yr3_forecast,
        'yr+2_forecast': yr2_forecast,
        'yr+1_forecast': yr1_forecast,
        'yr0_plan': plan_val,
        'yr-1_approp': approp_val,
        'yr-2_oblig': oblig_val,
    })
# Start the project timer
program_start = t.time()
# Set the random seeds for the project in order to ensure consistent results.
# BUG FIX: the original seeded only Python's `random` module, but every draw
# in this notebook goes through numpy's global generator (`np.random.*`), so
# runs were not actually reproducible. Seed both generators.
random.seed(6)
np.random.seed(6)
# Create random arrays to store the randomized values.
# Training frame: 10,000 randomized budget records.
train_df = init_array(10000)
train_df.tail(10)
train_df.head(5)

# Column names kept in a list so future analysis can loop over them.
col_list = list(train_df.columns)
col_list

# Work on a copy so exploration never mutates the training frame.
dataset = train_df.copy()
dataset.tail(10)
# Plot a box-and-whisker plot for all six budget variables.
import plotly.graph_objects as go
import numpy as np
# NOTE(review): no numpy randomness is drawn in this cell, but the seed is
# kept because it positions the global RNG for the lat/long draws later in
# the notebook — removing it would change those downstream values.
np.random.seed(1)
y3 = dataset['yr+3_forecast']
y2 = dataset['yr+2_forecast']
y1 = dataset['yr+1_forecast']
plan = dataset['yr0_plan']
approp = dataset['yr-1_approp']
oblig = dataset['yr-2_oblig']
fig = go.Figure()
# BUG FIX: the original passed x=y2 under the "yr+3_forecast" label and
# x=y3 under "yr+2_forecast" — the two traces were mislabeled.
fig.add_trace(go.Box(x=y3, name = "yr+3_forecast"))
fig.add_trace(go.Box(x=y2, name = "yr+2_forecast"))
fig.add_trace(go.Box(x=y1, name = "yr+1_forecast"))
fig.add_trace(go.Box(x=plan, name = "yr0_plan"))
fig.add_trace(go.Box(x=approp, name = "yr-1_approp"))
fig.add_trace(go.Box(x=oblig, name = "yr-2_oblig"))
fig.show()
Invoke seaborn's pairplot function in order to find any immediate correlation and statistical outliers
# Pairwise scatter/histogram grid over the six budget columns, to surface
# immediate correlations and statistical outliers.
pair_cols = [
    'yr+3_forecast',
    'yr+2_forecast',
    'yr+1_forecast',
    'yr0_plan',
    'yr-1_approp',
    'yr-2_oblig',
]
budget_pair = train_df[pair_cols]
sns.set(style="ticks", color_codes=True)
sns.pairplot(budget_pair)
# Convert the dataframe to numpy arrays: the first five columns are the
# predictors, the sixth ('yr-2_oblig') is the response.
x = dataset.iloc[:, 0:5].to_numpy()
y = dataset.iloc[:, 5].to_numpy()
x

# Reshape the response into a single column (n, 1), as the scaler expects 2-D.
y = np.reshape(y, (-1, 1))
y

# Scale predictors and response into the 0 -> 1 range with separate scalers
# (keeping scaler_y around allows inverse-transforming predictions later).
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()
print(scaler_x.fit(x))
xscale = scaler_x.transform(x)
print(scaler_y.fit(y))
yscale = scaler_y.transform(y)
# Segregate the scaled master data into train and test partitions
# (sklearn default: 75% train / 25% test).
X_train, X_test, y_train, y_test = train_test_split(xscale, yscale)

# Sanity-check the shapes; X_train holds predictors, y_train the response.
X_train.shape
y_train.shape
One input layer for the predictor variables, two hidden layers, and one output node
# NOTE(review): despite the notebook header's "recurrent neural network"
# framing, this is a plain feed-forward (Dense) regression model — confirm intent.
model = Sequential()
# Hidden layer 1: 10 ReLU units over the 5 predictor columns.
model.add(Dense(10, input_dim=5, kernel_initializer='normal', activation='relu'))
# Hidden layer 2: 5 ReLU units.
model.add(Dense(5, activation='relu'))
# Output: a single linear unit (regression on the 0-1 scaled response).
model.add(Dense(1, activation='linear'))
model.summary()
# Mean-squared-error loss with the Adam optimizer; track MSE and MAE.
model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
# Train 250 epochs at batch size 50, holding out 20% of the training split for validation.
history = model.fit(X_train, y_train, epochs=250, batch_size=50, verbose=1, validation_split=0.2)
# Plot training vs. validation loss per epoch to inspect convergence and
# check for overfitting.
loss_layout = dict(
    autosize=False,
    width=1500,
    height=750,
    title="Train vs. Validation Loss Test",
    xaxis=dict(title_text="No. of epochs", titlefont=dict(size=20)),
    yaxis=dict(title_text="Loss Value", titlefont=dict(size=20)),
)
fig = go.Figure()
fig.add_trace(go.Scatter(y=history.history['loss'], mode='lines', name='Train'))
fig.add_trace(go.Scatter(y=history.history['val_loss'], mode='lines+markers', name='Validation'))
fig.update_layout(**loss_layout)
fig.show()
# Create a new array of dummy data and test the model's effectiveness against it.
predict_full = init_array(25000)
# Drop the response column ('yr-2_oblig'); the model only sees the predictors.
valid_df = predict_full.iloc[:, :-1]
valid_df.tail(100)

# Convert the dataframe to a two-dimensional numpy array and validate its shape.
valid_array = valid_df.to_numpy()
valid_array
valid_array.shape

# NOTE(review): valid_array is unscaled, while the model was trained on
# MinMax-scaled inputs and outputs — predictions here are on a different
# scale than the response. Confirm whether scaler_x.transform() /
# scaler_y.inverse_transform() should be applied.
predict_val = model.predict(valid_array)
predict_val

# Merge the prediction array as a standalone column on predict_full.
# BUG FIX: model.predict returns shape (n, 1); flatten to 1-D before
# assigning it as a DataFrame column.
predict_full['predict_values'] = predict_val.ravel()
predict_full.tail(10)

# Relative difference between the actual response ('yr-2_oblig') and the
# prediction, stored per record in the 'delta' column.
predict_full['delta'] = (predict_full['yr-2_oblig'] - predict_full['predict_values']) / predict_full['yr-2_oblig']
predict_full.tail(10)
# Display the histogram of the yr-2_oblig response values.
fig = px.histogram(predict_full, x="yr-2_oblig", marginal="rug",  # marginal can be `box`, `violin`
                   hover_data=predict_full.columns, color_discrete_sequence=['orange'], opacity=0.5)
fig.update_layout(
    autosize=True,
    title = "yr-2_oblig (actuals) distribution ")
fig.show()

# Display the histogram of the predicted response values.
fig = px.histogram(predict_full, x="predict_values", marginal="rug",  # marginal can be `box`, `violin`
                   hover_data=predict_full.columns, color_discrete_sequence=['teal'], opacity=0.5)
fig.update_layout(
    autosize=True,
    title = "predict values distribution ")
fig.show()

# Plot the distribution of the delta (relative error) percentage values.
fig = px.histogram(predict_full, x="delta", marginal="rug",  # marginal can be `box`, `violin`
                   hover_data=predict_full.columns, color_discrete_sequence=['indianred'], opacity=0.5)
fig.update_layout(
    autosize=True,
    # BUG FIX: corrected the "Historgram" typo in the user-visible chart title.
    title = "Actual vs Prediction value Histogram ")
fig.show()
Display distribution of box & whisker plot for response and predict values
# NOTE(review): yr5 is drawn from `dataset` (the 10,000-row training frame)
# while yr6 comes from `predict_full` (the 25,000-row validation frame) —
# confirm that comparing distributions across two different datasets is intended
# (predict_full['yr-2_oblig'] may have been meant instead).
yr5 = dataset['yr-2_oblig']
yr6 = predict_full.predict_values
fig = go.Figure()
fig.add_trace(go.Box(x=yr5, name = "yr-2_oblig (actuals)"))
fig.add_trace(go.Box(x=yr6, name = "predict_values"))
fig.show()
# Retrieve the statistical parameters of the linear fit between the actual
# response and the model's predictions.
x = predict_full['yr-2_oblig']
y = predict_full['predict_values']
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
# BUG FIX: scipy.stats.linregress returns the correlation coefficient r, not
# r-squared; square it before reporting the "R-squared" line.
print(" Slope: {}\n Intercept: {}\n R-squared: {}\n P-Value: {}\n Standard Error: {}". format(slope, intercept, r_value**2, p_value, std_err))
# Scatter the actual response values against the predicted values, with an
# OLS trendline overlaid.
scatter_layout = dict(
    autosize=False,
    width=1000,
    height=750,
    title="Response values vs predicted values scatterplot",
    xaxis=dict(title_text="yr-2_oblig values (Response Values)", titlefont=dict(size=20)),
    yaxis=dict(title_text="predict_values (Predicted Values)", titlefont=dict(size=20)),
)
fig = px.scatter(predict_full, x="yr-2_oblig", y="predict_values", trendline="ols", opacity=0.25, color_discrete_sequence=['green'])
fig.update_layout(**scatter_layout)
fig.show()
Import the Virginia shape file
# TIGER/Line 2016 county-subdivision shapefile for Virginia (state FIPS 51).
us_shp = gpd.read_file('data/va_shp/tl_2016_51_cousub.shp')
Plot the shape file
fig, ax = plt.subplots(figsize = (30,30))
us_shp.plot(ax = ax)
# Number of validation records; used below to size the random lat/long draws.
df_size = len(predict_full)
df_size
# Create randomized lat/long values inside a bounding box over Virginia:
#   NE corner ~ (38.893732, -77.311721), SW corner ~ (36.762558, -78.359682)
lat_random = np.random.uniform(low=36.762558, high=38.893732, size=df_size)
# BUG FIX: the original passed low=-77.311721, high=-78.359682 (low > high),
# which numpy does not reject but samples with reversed semantics; pass the
# bounds in ascending order.
long_random = np.random.uniform(low=-78.359682, high=-77.311721, size=df_size)
len(long_random)

# Small frame of budget values plus coordinates for mapping.
# BUG FIX: select with .copy() so the column assignments below operate on an
# independent frame rather than a slice view (SettingWithCopyWarning), and
# carry 'predict_values' so the bubble plot further down can size by it.
geo_df = predict_full[['yr-2_oblig', 'predict_values']].copy()
geo_df['latitude'] = lat_random
geo_df['longitude'] = long_random
geo_df.tail(10)
# Create shapely Points from the (longitude, latitude) pairs.
geometry = [Point(xy) for xy in zip(geo_df["longitude"], geo_df["latitude"])]
geometry[:3]

# Define the coordinate reference system (WGS 84 lat/long).
# FIX: the {'init': 'epsg:4326'} dict form is deprecated in pyproj/geopandas;
# use the authority string instead.
crs = "EPSG:4326"

# Retrieve only the first 150 points.
geo_slice_df = geo_df[:150]
geo_slice_df

# Create the GeoDataFrame so lat/long become plottable point geometry.
# BUG FIX: the original never passed geometry=, so geo_slice had no geometry
# column and the spatial .plot() calls below could not draw the points.
geo_slice = gpd.GeoDataFrame(geo_slice_df,
                             crs = crs,
                             geometry = geometry[:150])
geo_slice.to_csv(r'geo_df_export.csv', index = False, header=True)
geo_slice.head()
# Overlay the at-risk subprojects on the Virginia base map.
fig, ax = plt.subplots(figsize = (20,15))
us_shp.plot(ax = ax, alpha = 0.4, color = "grey")

# NOTE(review): records with 20M <= yr-2_oblig <= 30M fall into neither band
# below — confirm the thresholds are intentional.
severe_risk = geo_slice[geo_slice['yr-2_oblig'] > 30000000]
severe_risk.plot(ax = ax, markersize = 220, color = "red", marker = "o", label = "Subprojects at Severe High Risk")

high_risk = geo_slice[(geo_slice['yr-2_oblig'] >= 10000000) & (geo_slice['yr-2_oblig'] < 20000000)]
high_risk.plot(ax = ax, markersize = 40, color = "orange", marker = "^", label = "Subprojects at High Risk")

plt.title("SUBPROJECTS AT RISK OF NOT MEETING OBLIGATION RATES")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend(prop={'size':12})
# Scatter the first 50 randomized coordinates, sizing each marker by its
# budget value.
# BUG FIX: in this file geo_df carries only 'yr-2_oblig', 'latitude' and
# 'longitude' — size="predict_values" raised a KeyError; size by the budget
# column that actually exists.
bubble_layout = dict(
    autosize=False,
    width=1000,
    height=750,
    title="US Map of Budget points",
    xaxis=dict(title_text="Longitude", titlefont=dict(size=20)),
    yaxis=dict(title_text="Latitude", titlefont=dict(size=20)),
)
fig = px.scatter(geo_df[:50], x="longitude", y="latitude", opacity=0.25, size = "yr-2_oblig", color_discrete_sequence=['green'])
fig.update_layout(**bubble_layout)
fig.show()
# Stop the project timer and report total wall-clock runtime.
elapsed = round(t.time() - program_start, 2)
print("Total time for program execution is {} seconds".format(elapsed))